## ---------------------------
##
## Script name: process_demographics_Nation.R
##
## Description: Gathers demographic variables for 100 cities. Uses the
## 'tidycensus' package to do this. Demographics for 2014
## and 2019 are loaded, and then the years in between are
## imputed based on the change between those years. The 
## year 2020 is assigned the same values as 2019.
##
## ---------------------------
## DEPENDENCIES

library("purrr")
library("tidyverse")
library("tidycensus")
library(sf)

# Register the Census API key with tidycensus. `census_key` is not defined in
# the visible part of this script, so it must already exist in the calling
# environment. With install = TRUE / overwrite = TRUE the key is also written
# to the user's .Renviron, replacing any previously stored key.
census_api_key(
  key = census_key,
  overwrite = TRUE,
  install = TRUE
)

# Table of cities to process (one row per city). The path is built from
# `file_locations`, which must also exist before this script is sourced.
# The functions below read `city_list` as a global.
city_list <- read.csv(
  file = paste(
    file_locations$current_fp,
    file_locations$Helpers$`City List`,
    sep = "/"
  )
)

## ---------------------------


process_demographics_nation <- function(file_locations, population_cutoff, overwrite=FALSE) {
  #'
  #'@description Gets tract-level ACS data for 2014 and 2019, stacks
  #'them, derives the number of people below the poverty level,
  #'fills in the years 2014-2020 via .fill_missing_population,
  #'rounds the population counts, drops low-population tracts and
  #'writes the result to the processed demographics csv.
  #'
  #'@param file_locations list. File locations loaded from
  #'file_locations.R
  #'@param population_cutoff numeric. A tract is kept only if its
  #'smallest total_population across all years is at least this
  #'value.
  #'@param overwrite boolean. Determines if already processed 
  #'file should be overwritten.
  #'
  #'@return saves the clean output and (invisibly) returns NULL
  
  # a. Demographics for end of year range (2020 not yet available)
  demos_2019 <- .gather_yearly_demographics_data(file_locations, 2019, overwrite=overwrite)
  
  # b. Demographics for beginning of year range
  demos_2014 <- .gather_yearly_demographics_data(file_locations, 2014, overwrite=overwrite)
  
  # c. Combine and impute data for years in between
  output_fp <- paste(file_locations$current_fp, file_locations$Nation$Demographics$processed, sep="/")
  
  # `||` is the scalar operator intended for `if` conditions
  if (!file.exists(output_fp) || overwrite) {
  
    # a. combine beginning and end of year range demographics under a common
    #    tract id column name
    demos_all <- rbind(demos_2019 %>% rename(tract_2010=tract_2019),
                       demos_2014 %>% rename(tract_2010=tract_2014)) %>%
      # scale the poverty share (below_pov_level / total_pov_responses) to the
      # tract's total population; tracts without poverty responses get 0
      mutate(total_below_pov_level = ifelse(total_pov_responses>0, (total_population*below_pov_level)/total_pov_responses, 0))
    
    # b. fill in the years in between with imputed data
    demos_all <- .fill_missing_population(demos_all, id_col=c("tract_2010", "city"), 
                                          cols_to_impute=c("total_population","total_population_white","total_population_black","total_population_hisp",
                                                           "total_population_asian", "total_population_pacisl", "total_population_otherrace",
                                                           "total_below_pov_level"),
                                          years_to_include=seq(2014,2020))
    
    # Round populations to whole number (every column whose name contains 'total')
    demos_all <- demos_all %>%
      mutate_at(vars(contains('total')), ~ round(., 0))

    # Keep a tract only if its population never drops below the cutoff in any
    # year; ungroup so the written frame carries no leftover grouping
    demos_all <- demos_all %>%
      group_by(city, tract_2010) %>%
      mutate(lowest_pop_value = min(total_population)) %>%
      filter(lowest_pop_value >= population_cutoff) %>%
      ungroup() %>%
      select(-lowest_pop_value)
    
    write.csv(demos_all, output_fp,
              row.names = FALSE)
  }
  
  invisible(NULL)
}


### GATHER ACS VARIABLES FOR ALL CITIES

.gather_yearly_demographics_data <- function(file_locations, year, overwrite=FALSE) {
  #'
  #'@description Builds (or reloads) the tract-level ACS table for a
  #'single end year, covering every city in the global `city_list`.
  #'For each city the state's ACS tracts are loaded and restricted
  #'to the tracts listed for that city in the processed geographies
  #'file. The result is cached as a csv and reused on later calls
  #'unless `overwrite` is TRUE.
  #'
  #'@param file_locations list. File locations loaded from
  #'file_locations.R
  #'@param year integer. End Year of 5-year ACS to load.
  #'@param overwrite boolean. Determines if already processed 
  #'file should be overwritten.
  #'
  #'@return dataframe of population values for given end year, with
  #'the tract id in column `tract_<year>` plus `year` and `city`


  tract_col <- paste0("tract_", year)
  output_fp <- paste(file_locations$current_fp, file_locations$Nation$Demographics$raw[[as.character(year)]], sep="/")
    
  if (overwrite || !file.exists(output_fp)) {
  
    # The city/tract lookup does not depend on the city being processed, so
    # read it once rather than once per loop iteration.
    all_city_tracts <- read_sf(paste(file_locations$current_fp, file_locations$Nation$Geographies$processed, sep="/"))
    
    n_cities <- nrow(city_list)
    city_demos_list <- vector("list", n_cities)
    
    # ACS data is requested per state; remember each state's table so cities
    # sharing a state do not trigger repeated downloads.
    state_demos <- list()
    
    # seq_len() keeps the loop from running when city_list has no rows
    for (row in seq_len(n_cities)) {
      
      state_abr <- as.character(city_list$state_abr[row])
      city_name <- as.character(city_list$city[row])
      
      print(paste0("Getting demographics for ", city_name, " (", row, "/", n_cities, ")"))
    
      # a. load ACS variables for the city's state (once per state)
      if (is.null(state_demos[[state_abr]])) {
        state_demos[[state_abr]] <- .load_acs_variables(state_abr=state_abr, year=year)
      }
      
      # b. filter census tracts to be only those in city boundary
      city_tract_ids <- all_city_tracts$tract_2010[all_city_tracts$city %in% city_name]
      
      # `!!` injects the local values so they cannot be shadowed by
      # identically named columns in the data
      city_demos_list[[row]] <- state_demos[[state_abr]] %>%
        filter(!!sym(tract_col) %in% city_tract_ids) %>%
        mutate(year = !!year,
               city = !!city_name)
    
    }
    city_demos_df <- bind_rows(city_demos_list, .id = NULL) %>%
      distinct()
    
    write.csv(city_demos_df, output_fp, row.names = FALSE)
    
    return(city_demos_df)
    
  } else {
    
    # Read the tract id as text so it is never converted to a number; the
    # padding below restores leading zeros for files where they were lost.
    city_demos <- read.csv(output_fp, colClasses = setNames("character", tract_col))
    
    return(city_demos %>%
             mutate(!!sym(tract_col) := str_pad(!!sym(tract_col), 11, side="left", pad="0")))
    
  }
  
}

### LOAD ACS VARIABLES FOR A SPECIFIC YEAR
# Maps each output column name to the ACS variable code(s) that feed it.
# An entry with several codes is summed into a single column by
# .load_acs_variables. The order of entries determines the column order of
# the resulting table.
# NOTE(review): B03002_005 is not mapped to any category here, so the race
# columns may not add up to total_population -- confirm this is intentional.
variables <- list(
  total_population           = "B03002_001",
  total_population_white     = "B03002_003",
  total_population_black     = "B03002_004",
  total_population_hisp      = "B03002_012",
  total_pov_responses        = "B17001_001",
  below_pov_level            = "B17001_002",
  total_population_asian     = "B03002_006",
  total_population_pacisl    = "B03002_007",
  total_population_otherrace = c("B03002_008", "B03002_009")
)

.load_acs_variables <- function(state_abr, year) {
  #'
  #'@description Loads tract-level ACS data for given end year 
  #'and state. All codes listed in the global `variables` are
  #'requested in a single get_acs call; codes that belong to the
  #'same entry of `variables` are summed into one column.
  #'
  #'@param state_abr character. State abbreviation of city.
  #'@param year integer. End Year of 5-year ACS to load.
  #'
  #'@return dataframe with the tract id in column `tract_<year>`
  #'followed by one column per entry of `variables`

  
  # Names are dropped on purpose: a named vector would make tidycensus relabel
  # the `variable` column, and the raw codes are needed as column names below.
  acs_codes <- unique(unlist(variables, use.names = FALSE))
  
  # One request for every code instead of one request per output column
  acs_long <- get_acs(geography="tract", variables=acs_codes, state=state_abr, 
                      year=year, cache_table=TRUE)
  
  # One row per tract, one column per ACS code
  acs_wide <- acs_long %>% 
    select(-moe) %>% 
    spread(key=variable, value=estimate)
  
  # Sum the code(s) behind each output column; drop = FALSE keeps a one-code
  # selection two-dimensional so rowSums works for every entry
  totals <- lapply(variables, function(codes) {
    rowSums(acs_wide[, codes, drop = FALSE])
  })
  
  tract_col <- paste0("tract_", year)
  var_df <- bind_cols(tibble(!!tract_col := acs_wide$GEOID),
                      as_tibble(totals))

  return(var_df)
}



